Load Packages & Functions
source("functions.R")
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# We'd like to plot with pretty colors based on national park posters :)
# install.packages("devtools")
#devtools::install_github("katiejolly/nationalparkcolors")
library(nationalparkcolors)
colors4 <- park_palette("Saguaro", 4)
Load Data
# Import the tax data
tax_physeq <- import_gtdbtk_taxonomy_and_checkm(
taxonomy_filename = "Tara_Oceans_Med/TOBG-MED-READCOUNTMATCH.bac120.tsv",
checkm_filename = "Tara_Oceans_Med/TOBG-MED_qa.txt")
## ── Attaching packages ─────────────────────────────────────────── tidyverse 1.2.1.9000 ──
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ✔ purrr 0.3.2
## ── Conflicts ─────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks plotly::filter(), stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Import the metadata
meta_physeq <- import_metadata(province_filename = "Tara_Oceans_Med/Sample-Province.tsv",
sizeFraction_filename = "Tara_Oceans_Med/Sample-SizeFraction.tsv") %>%
mutate(names = rownames(.)) %>%
separate(., col = names, into = c("station", "fraction", "depth"), sep = "_") %>%
mutate(r_names = paste(station, fraction, depth, sep = "_")) %>%
column_to_rownames(var = "r_names") %>%
dplyr::select(-c(station, fraction)) %>%
sample_data()
## Warning in class(x) <- c(subclass, tibble_class): Setting class(x) to
## multiple strings ("tbl_df", "tbl", ...); result will no longer be an S4
## object
# Import the readcounts
mag_table <- import_readcounts(readcounts_filename = "Tara_Oceans_Med/TOBG-MED-TOTAL.readcounts")
# Import the tree
mag_tree <- import_tree(tree_filename = "Tara_Oceans_Med/GToTree_output.newick")
# Put into phyloseq object
tara_physeq <- phyloseq(meta_physeq, tax_physeq, mag_table, mag_tree)
Normalization
# Normalizing the matrix
# Dividing rows by the genome size for the bin
# Genome size is different from Bin size
# Genome size is BinSize * BinCompletion
# Read in checkm data to calculate the expected
checkm <- read.csv("Tara_Oceans_Med/TOBG-MED_qa.txt", sep = "\t", as.is = TRUE) %>%
dplyr::select("Bin.Id", "Completeness", "Genome.size..bp.") %>%
dplyr::rename(est_genome_size = "Genome.size..bp.") %>%
# Make a new column for the expected genome size based on est_genome_size * completeness
mutate(exp_genome_size = round(est_genome_size/(Completeness/100)))
ordered_checkm <- data.frame(Bin.Id = rownames(mag_table)) %>%
left_join(checkm, by = "Bin.Id")
## Warning: Column `Bin.Id` joining factor and character vector, coercing into
## character vector
#Sanity check
stopifnot(rownames(mag_table) == ordered_checkm$Bin.Id)
# Do the normalization
t_mat <- t(as.matrix(mag_table))
# divide the matrix columns by the genome size of each MAG
norm_mat <- t_mat/ordered_checkm$exp_genome_size
t_norm_mat <- t(norm_mat)
# Combine into a normalized phyloseq object
tara_norm_physeq <- phyloseq(meta_physeq, tax_physeq, mag_tree,
otu_table(t_norm_mat, taxa_are_rows = TRUE))
Heatmap
heatmap(otu_table(tara_norm_physeq))

# Visualize only the top 50 taxa
top_50MAGs <- names(sort(taxa_sums(tara_norm_physeq), decreasing = TRUE))[1:50]
top_50MAGs_physeq <- prune_taxa(top_50MAGs, tara_norm_physeq)
# Subset only the bacterial samples
bac_top50MAGs <- subset_samples(top_50MAGs_physeq, size_fraction =="bact")
otu_long_50_bact <- psmelt(otu_table(bac_top50MAGs)) %>%
mutate(log_abund = log2(Abundance+0.0000001))
heat_plot <- otu_long_50_bact %>%
ggplot(aes(x=Sample, y=OTU)) +
geom_tile(aes(fill = log_abund)) +
scale_fill_distiller(palette = "YlGnBu") +
labs(title = "Top 50 MAGs") +
theme(axis.text.x = element_text(angle = 30, vjust = 1, hjust = 1))
ggplotly(heat_plot)
Beta diversity plots
tara_norm.ord <- ordinate(tara_norm_physeq, method = "PCoA", distance = "bray")
plot_ordination(tara_norm_physeq, tara_norm.ord, color = "province") +
facet_wrap(~size_fraction) +
theme_minimal() +
scale_color_brewer(palette = "Paired")

# Subset the bacterial samples and color by depth
bact <- subset_samples(tara_norm_physeq, size_fraction == "bact")
bact.ord <- ordinate(bact, method = "PCoA", distance = "bray")
plot_ordination(bact, bact.ord, color = "depth")+
theme_minimal() +
geom_point(size = 2) +
scale_color_manual(values = colors4)

Abundance plots
top_taxa <- names(sort(taxa_sums(bact),decreasing = TRUE))[1:4]
top_bact <- prune_taxa(top_taxa, bact)
top_bact_df <- psmelt(top_bact)
abundplot <- ggplot(top_bact_df, aes(x=province, y= Abundance, color = depth))+
facet_wrap(~OTU, scales = "free_y") +
geom_jitter(size = 2) + theme_minimal()+
labs(y = "Normalized MAG Abundance") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1),
axis.title.x = element_blank()) +
scale_color_manual(values = colors4)
ggplotly(abundplot)